Overview

This notebook performs differential expression analysis of human endogenous retroviruses (HERVs) across cell-type clusters and brain regions from ASAP PMDBS snRNA-seq data.

Specifically, it: * Defines helper functions for extracting significant genes and visualizing expression (custom getSignName, getAverage, meanPlot_cus, and box_pseudobulk). * Loads TE (HERV) count data generated from trusTEr outputs and organizes them by region and cluster. * Builds DESeq2 models to compare PD vs. control expression across multiple clusters. * Produces summary tables of DE results, mean expression plots, and box plots for selected example HERVs. * Identifies overlaps of upregulated HERVs across cell types and regions using UpSet plots. * Compares in vivo upregulated HERVs in microglia with in vitro stimulation experiments.

Set up

Loading data and needed libraries

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     shift
## The following object is masked from 'package:GenomicRanges':
## 
##     shift
## The following object is masked from 'package:IRanges':
## 
##     shift
## The following objects are masked from 'package:S4Vectors':
## 
##     first, second
library(ggpubr)
## Loading required package: ggplot2
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::between()      masks data.table::between()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks data.table::first(), S4Vectors::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ dplyr::last()         masks data.table::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks data.table::second(), S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openxlsx)

load("/Volumes/MyPassport/ASAP/code/ASAP_PMDBS_snRNAseq/processing/preprocessed_TE_data.RData")
source("/Volumes/MyPassport/ASAP/code/ASAP_PMDBS_snRNAseq/processing/TE_heatmap_functions.R")
# Load samplesheet
samplesheet <- read.xlsx("/Volumes/MyPassport/ASAP/code/ASAP_PMDBS_snRNAseq/original/ASAP_samplesheet.xlsx")
# I want to split HERV_data$te_counts per region, but first I need to categorize the columns by region
clusters <- colnames(HERV_data$te_counts)[7:ncol(HERV_data$te_counts)]
clusters_per_region <- data.frame(cluster = clusters,
                                  region = str_extract(clusters, "SN|PUT|PFC|AMY")) %>%
  group_by(region) %>%
  summarise(clusters = list(cluster)) %>%
  ungroup()

expressed_hervs_bulkrna <- fread("/Volumes/MyPassport/ASAP/code/ASAP_PMDBS_bulkRNAseq/results/tables/expressed_fl_hervs_expressed.tab", data.table = F, header = F)$V1
# Now we loop through the regions and select only the region's columns
te_counts <- list()
for (i in 1:nrow(clusters_per_region)) {
  region <- clusters_per_region$region[i]
  cluster_cols <- clusters_per_region$clusters[[i]] # these are vectors of column names
  # Subset counts dataframe by these columns and keep Geneid
  te_counts[[region]] <- HERV_data$te_counts %>%
    select(Geneid, all_of(cluster_cols)) %>%
    filter(Geneid %in% expressed_hervs_bulkrna) # Expressed in bulk RNA
}

head(te_counts$SN)

1. Helper Functions

Functions: getSignName(), getAverage(), meanPlot_cus(), pick_colors_meanplot()

Purpose: Extract significantly DE HERVs, compute mean expression per condition, and visualize pairwise mean comparisons.

## getSignName ##
# Get significantly different gene names. 
# Taken from source code of the package deseqAbstraction which is no longer available on github.
# Credits to Per L. Brattås
# Parameters:
# x = results object from deseq
# p = padj threshold for significance
# l = log2FC threshold for significance
getSignName <- function(x,p,l=0) {
  up <- x[!is.na(x$padj) & x$padj < p & x$log2FoldChange > l,]
  down <- x[!is.na(x$padj) & x$padj < p & x$log2FoldChange < -l,]
  return(list(up=rownames(up),down=rownames(down)))
}
## getAverage ##
# Get average expression (normalized by median of ratios) of each of the conditions in a deseq object.
# Taken from source code of the package deseqAbstraction which is no longer available on github.
# Credits to Per L. Brattås
# Parameters:
# dds = deseq object
getAverage <- function(dds) {
  baseMeanPerLvl <- sapply( levels(dds$Dx), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$Dx == lvl] ) )
  baseSDPerLvl <- sapply( levels(dds$Dx), function(lvl) apply( counts(dds,normalized=TRUE)[,dds$Dx == lvl],1,sd ) )
  colnames(baseSDPerLvl) <- paste("st.dev:",colnames(baseSDPerLvl),sep="")
  return(list(Mean=baseMeanPerLvl,SD=baseSDPerLvl))
}

meanPlot_cus <- function(exp,test,c1 = "condition 1",c2 = "condition 2",p=.05,l=0,id=F, ttl="", 
                         repel=TRUE, col1="tomato", col2="darkturquoise", col3="grey", highlights=NA, select_rows = NA){
  if(!all(is.na(select_rows))){
    exp <- exp[select_rows,]
    test <- test[select_rows,]
  }
  
  sign <- getSignName(x = test,p = p,l = l)
  u <- sign$up
  d <- sign$down
  
  #color up and down sign..
  colVec <- ifelse(test = (rownames(exp) %in% u),
                   yes = col1,
                   no = ifelse(test = (rownames(exp) %in% d),
                               yes = col2, no =col3))
  colVec[is.na(colVec)] <- "steelblue" ## if NA make sure it's not counted as <p
  #size of points
  cexVec <- ifelse(test = (rownames(exp) %in% u),
                   yes = 0.35,
                   no = ifelse(test = (rownames(exp) %in% d),
                               yes = 0.35, no = 0.3))
  
  exp_log <- as.data.frame(log2(exp[,c(c1, c2)] + 0.5))
  exp_log$Name <- rownames(exp_log)
  
  exp_log$reg <- factor(ifelse(exp_log$Name %in% u, paste('upregulated in ', c1, ' (', length(u), ')', sep =''),
                               ifelse(exp_log$Name %in% d, paste('downregulated in ', c1,' (', length(d), ')', sep =''), paste('not significant', ' (', (nrow(test) - length(u) - length(d)), ')', sep=''))))
  
  library(ggrepel)
  if(repel == TRUE){
    plt <- ggplot(exp_log, aes(x=get(c2), y=get(c1), label=Name, color=reg)) + geom_point(aes(size=cexVec), alpha=0.7)+ scale_color_manual(values=c(col2, col3, col1))+ scale_size_continuous(range=c(1,2), guide="none")+ geom_text_repel(data = subset(exp_log, Name %in% u | Name %in% d),direction    = "y", nudge_y = 0.4, nudge_x = -0.5)
  }
  else{
    plt <- ggplot(exp_log, aes(x=get(c2), y=get(c1), color=reg)) + geom_point(aes(size=cexVec), alpha=0.7)+ scale_color_manual(values=c(col2, col3, col1))+ scale_size_continuous(range=c(1,2), guide="none")
  }
  plt <- plt + labs(x=paste("log2(mean ",c2,")",sep=""), 
                    y=paste("log2(mean ",c1,")",sep=""),
                    title=paste(ttl, paste(c1," vs. ",c2,sep=""), sep = ': '),
                    subtitle=paste("p-adj < ",p,", log2(fc) > ",l,sep=""))+ theme_bw() +
    theme( plot.title = element_text( size=14, face="bold"), text = element_text(size=15), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), legend.title=element_blank()) 
  
  
  # if(id == T) {
  #   identify(log2(exp[,1]),log2(exp[,2]),labels = rownames(exp))
  # }
  
  if(length(highlights) > 0){
    plt <- plt + geom_point(data=exp_log[highlights,], aes(x=get(c2), y=get(c1)), colour="springgreen3", size=5, shape=1, stroke=2)
  }
  return(plt)
  
}

pick_colors_meanplot <- function(df){
  if(any(df$padj < 0.05 & 
           df$log2FoldChange > 1, na.rm = TRUE)){ # There is upregulated
      if(any(df$padj < 0.05 & 
             df$log2FoldChange < -1, na.rm = TRUE)){ # There is downregulated
        col1 = "tomato"
        col2 = "darkturquoise"
        col3 = "grey"
      }else{ # Upregulated, but not downregulated
        col1 = NA
        col2 = "grey" 
        col3 = "tomato"
      }
    }else{
      if(any(df$padj < 0.05 & # There is downregulated but not upregulated
             df$log2FoldChange < -1, na.rm = TRUE)){ 
        col1 = NA 
        col2 = "darkturquoise"
        col3 = "grey"
      }else{
        col1 = "grey"
        col2 = "grey"
        col3 = "grey"
      }
    }
  return(list("col1" = col1,
              "col2" = col2,
              "col3" = col3))
}

2. Differential Expression Analysis (DEA) with DESeq2

te_dds_list <- list()
te_exp_list <- list()
te_res_list <- list()
te_res_list_df <- list()
regions <- c("SN", "PUT", "PFC", "AMY")
for(cluster in as.character(0:7)){
  for(region in regions){
    print(cluster)
    print(region)
    samplesheet_list <- split(samplesheet, f = samplesheet$Region)
    rownames(samplesheet_list[[region]]) <- samplesheet_list[[region]]$Sample
    samples_cluster <- paste(rownames(samplesheet_list[[region]]), cluster, sep="_")
    samples_cluster <- samples_cluster[which(samples_cluster %in% colnames(te_counts[[region]]))]
    rownames(samplesheet_list[[region]]) <- paste(rownames(samplesheet_list[[region]]), cluster, sep="_")
    # Expressed in snRNA
    te_counts_expressed <- te_counts[[region]][which(rowSums(te_counts[[region]][,samples_cluster]) > 0),]
    print(paste0("Cluster ", cluster, ", region ", region, " expresses ", nrow(te_counts_expressed)))
    
    region_cluster <- paste0(region, "_", cluster)
    te_dds_list[[region_cluster]] <- DESeqDataSetFromMatrix((te_counts_expressed[,samples_cluster]+1), samplesheet_list[[region]][samples_cluster,], design =  ~ Dx)
    te_dds_list[[region_cluster]]$Dx <- relevel(te_dds_list[[region_cluster]]$Dx, "Ctl")
    error_handle <- tryCatch({
      te_dds_list[[region_cluster]] <- DESeq(te_dds_list[[region_cluster]])
    }, error = function(e) {
      print(paste("Something happened with ", region, cluster))
      return(1)
    })
    if(is.numeric(error_handle)){
      if(error_handle == 1){
        te_dds_list[[region_cluster]] <- NA
        te_res_list[[region_cluster]] <- NA
        te_res_list_df[[region_cluster]] <- NA
        te_exp_list[[region_cluster]] <- NA
      }
    }else{
      te_exp_list[[region_cluster]] <- getAverage(te_dds_list[[region_cluster]])
      te_res_list[[region_cluster]] <- lfcShrink(te_dds_list[[region_cluster]], "Dx_PD_vs_Ctl")
      te_res_list_df[[region_cluster]] <- as.data.frame(te_res_list[[region_cluster]])
      te_res_list_df[[region_cluster]]$HERV_id <- rownames(te_res_list_df[[region_cluster]])
    }
  }
}
## [1] "0"
## [1] "SN"
## [1] "Cluster 0, region SN expresses 66"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## Warning in lfproc(x, y, weights = weights, cens = cens, base = base, geth =
## geth, : Estimated rdf < 1.0; not estimating variance
## final dispersion estimates
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "0"
## [1] "PUT"
## [1] "Cluster 0, region PUT expresses 674"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 47 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "0"
## [1] "PFC"
## [1] "Cluster 0, region PFC expresses 724"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 19 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "0"
## [1] "AMY"
## [1] "Cluster 0, region AMY expresses 678"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 54 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "1"
## [1] "SN"
## [1] "Cluster 1, region SN expresses 451"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 82 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "1"
## [1] "PUT"
## [1] "Cluster 1, region PUT expresses 634"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 55 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "1"
## [1] "PFC"
## [1] "Cluster 1, region PFC expresses 699"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 17 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "1"
## [1] "AMY"
## [1] "Cluster 1, region AMY expresses 683"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 48 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "2"
## [1] "SN"
## [1] "Cluster 2, region SN expresses 647"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 86 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "2"
## [1] "PUT"
## [1] "Cluster 2, region PUT expresses 682"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 81 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "2"
## [1] "PFC"
## [1] "Cluster 2, region PFC expresses 672"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 37 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "2"
## [1] "AMY"
## [1] "Cluster 2, region AMY expresses 670"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 78 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "3"
## [1] "SN"
## [1] "Cluster 3, region SN expresses 607"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 83 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "3"
## [1] "PUT"
## [1] "Cluster 3, region PUT expresses 604"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 93 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "3"
## [1] "PFC"
## [1] "Cluster 3, region PFC expresses 601"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 31 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "3"
## [1] "AMY"
## [1] "Cluster 3, region AMY expresses 623"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 75 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "4"
## [1] "SN"
## [1] "Cluster 4, region SN expresses 526"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 59 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "4"
## [1] "PUT"
## [1] "Cluster 4, region PUT expresses 493"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 40 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "4"
## [1] "PFC"
## [1] "Cluster 4, region PFC expresses 460"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 25 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "4"
## [1] "AMY"
## [1] "Cluster 4, region AMY expresses 504"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 35 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "5"
## [1] "SN"
## [1] "Cluster 5, region SN expresses 697"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 53 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "5"
## [1] "PUT"
## [1] "Cluster 5, region PUT expresses 674"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 70 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "5"
## [1] "PFC"
## [1] "Cluster 5, region PFC expresses 659"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 77 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "5"
## [1] "AMY"
## [1] "Cluster 5, region AMY expresses 656"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 86 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "6"
## [1] "SN"
## [1] "Cluster 6, region SN expresses 287"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 39 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "6"
## [1] "PUT"
## [1] "Cluster 6, region PUT expresses 212"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "6"
## [1] "PFC"
## [1] "Cluster 6, region PFC expresses 314"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "6"
## [1] "AMY"
## [1] "Cluster 6, region AMY expresses 257"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 21 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "7"
## [1] "SN"
## [1] "Cluster 7, region SN expresses 232"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 23 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "7"
## [1] "PUT"
## [1] "Cluster 7, region PUT expresses 109"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 10 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "7"
## [1] "PFC"
## [1] "Cluster 7, region PFC expresses 62"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## [1] "7"
## [1] "AMY"
## [1] "Cluster 7, region AMY expresses 107"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 10 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
openxlsx::write.xlsx(te_res_list_df, paste("/Volumes/MyPassport/ASAP/data/ASAP_PMDBS_snRNAseq/results/tables/HERV_PDvsCtl_res.xlsx", sep=""))

3. Mean expression scatter plots

mean_plots_list <- list()
te_res_expressed_list_df <- list()

for(region_cluster in names(te_exp_list)){
    if(!all(is.na(te_res_list[[region_cluster]]))){
      region <- sapply(str_split(region_cluster, "_"), `[[`, 1)
      effect <- unique(samplesheet_list[[region]][which(samplesheet_list[[region]]$Dx != "Ctl"),"Dx"])
      
      colors_list <- pick_colors_meanplot(te_res_list[[region_cluster]])
      mean_plots_list[[region_cluster]] <- meanPlot_cus(exp = te_exp_list[[region_cluster]]$Mean,
                                                        test = te_res_list[[region_cluster]], 
                                                        col1=colors_list$col1, col2=colors_list$col2, col3=colors_list$col3,
                                                        c1 = effect, c2="Ctl", p = 0.05, l=1, ttl = region_cluster, repel = F)
    }
}

# Neurons (cluster 0): 
# Very little number of nuclei in SN in this cluster - ignored
mean_plots_list$PUT_0 
## Warning: Removed 674 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_0 
## Warning: Removed 724 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_0 
## Warning: Removed 678 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Neurons (cluster 1)
mean_plots_list$PUT_1 
## Warning: Removed 634 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_1 
## Warning: Removed 699 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_1 
## Warning: Removed 683 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Astrocytes
mean_plots_list$SN_2 
## Warning: Removed 647 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PUT_2
## Warning: Removed 682 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_2 
## Warning: Removed 672 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_2
## Warning: Removed 670 rows containing missing values or values outside the scale range
## (`geom_point()`).

# OPCs
mean_plots_list$SN_3
## Warning: Removed 607 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PUT_3
## Warning: Removed 604 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_3 
## Warning: Removed 601 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_3
## Warning: Removed 623 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Microglia
mean_plots_list$SN_4
## Warning: Removed 526 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PUT_4
## Warning: Removed 493 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_4 
## Warning: Removed 460 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_4 
## Warning: Removed 504 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Oligodendrocytes
mean_plots_list$SN_5
## Warning: Removed 697 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PUT_5
## Warning: Removed 674 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$PFC_5
## Warning: Removed 659 rows containing missing values or values outside the scale range
## (`geom_point()`).

mean_plots_list$AMY_5
## Warning: Removed 656 rows containing missing values or values outside the scale range
## (`geom_point()`).

# VLMCs and Tcells skipped as their clusters are too small 

4. Box plots

box_pseudobulk <- function(df, res, gene, log2=F, jitter.height = 0){
  if(!gene %in% rownames(df)){
    return("This gene was not found")
  }
  tmp_microglia_df_melt <- reshape2::melt(df[gene,])
  tmp_microglia_df_melt$sample <- rownames(tmp_microglia_df_melt)
  cluster <- unique(sapply(str_split(rownames(tmp_microglia_df_melt), "_"), tail, 1))
  tmp_samplesheet <- samplesheet
  tmp_samplesheet$sample_cluster <- paste(tmp_samplesheet$Sample, cluster, sep = "_")
  tmp_samplesheet <- tmp_samplesheet[which(tmp_samplesheet$sample_cluster %in% rownames(tmp_microglia_df_melt)),]
  if(nrow(tmp_microglia_df_melt) == nrow(tmp_samplesheet)){
    tmp_microglia_df_melt <- merge(tmp_microglia_df_melt, tmp_samplesheet, by.x="sample", by.y="sample_cluster")
    tmp <- res[gene,]
    tmp$group1 <- "PD"
    tmp$group2 <- "Ctl"
    tmp$`.y.` <- "value"
    tmp$p.format <- format(tmp$pvalue, digits = 3, scientific = T)
    tmp$log2FoldChange <- round(tmp$log2FoldChange, digits = 3)
    # If a p-value is less than 0.05, it is flagged with one star (*). If a p-value is less than 0.01, it is flagged with 2 stars (**). If a p-value is less than 0.001, it is flagged with three stars (***).
    tmp$p.signif <- ifelse(tmp$pvalue < 0.001, "***", 
                           ifelse(tmp$pvalue < 0.01, "**", 
                                  ifelse(tmp$pvalue < 0.05, "*", "ns") ) ) 
    tmp$pformat_log2FC <- paste("pvalue=", tmp$p.format, ";\nlog2FC=", tmp$log2FoldChange, sep="")
    tmp$pformat_log2FC_star <- ifelse(tmp$p.signif != "ns", paste("log2FC=", tmp$log2FoldChange, "\n", tmp$p.signif, sep=""), tmp$p.signif)
    tmp$Dx <- "Dx"
    
    if(log2){
      plt <- ggplot(tmp_microglia_df_melt, aes(x=Dx, y=log2(value+1), fill=Dx)) + 
        geom_jitter(aes(color=Dx), size=1, width = 0.1, height = 0) + ylim(c(min(log2(tmp_microglia_df_melt$value+1)), max(log2(tmp_microglia_df_melt$value+1)) + sd(log2(tmp_microglia_df_melt$value+1)))) + labs(x="Region", y="log2(Normalized Expression)")   +
      stat_pvalue_manual(tmp, y.position = max(log2(tmp_microglia_df_melt$value+1)) + 0.05*(max(log2(tmp_microglia_df_melt$value+1))), label = "pformat_log2FC")
    }else{
      plt <- ggplot(tmp_microglia_df_melt, aes(x=Dx, y=value, fill=Dx)) + 
        geom_jitter(aes(color=Dx), size=1, width = 0.1, height = 0) + 
        ylim(c(min(tmp_microglia_df_melt$value), max(tmp_microglia_df_melt$value) + sd(tmp_microglia_df_melt$value))) + labs(x="Region", y="Normalized Expression")  +
      stat_pvalue_manual(tmp, y.position = max(tmp_microglia_df_melt$value) + 0.05*(max(tmp_microglia_df_melt$value)), label = "pformat_log2FC")
    }
    plt + 
      geom_boxplot(outliers = F, width=0.3, size=0.3, alpha=0.5) + 
      stat_summary( geom = "point", fun.y = "mean",size=2, col = "black", shape = 24) +
      stat_summary(geom = "line", fun.y = mean, group = 1, linetype="dashed") +
      theme_bw() +
      theme(text = element_text(size = 15)) +
      facet_wrap(.~Region, scales = "free", ncol=4) + 
      scale_fill_manual(values = c("Ctl" = "lightgrey", "PD" = "tomato")) +
      scale_color_manual(values = c("Ctl" = "grey", "PD" = "tomato")) + ggtitle(gene)
  }
  
}

herv_counts_norm <- list()
for(region_cluster in names(te_dds_list)){
  herv_counts_norm[[region_cluster]] <- list()
  error_handle <- tryCatch({
      herv_counts_norm[[region_cluster]] <- counts(te_dds_list[[region_cluster]], normalize = T)
    }, error = function(e) {
      print(paste("Something happened with ", region_cluster))
      return(1)
    })
}

samplesheet_list[["SN"]]$sample_cluster <- paste(samplesheet_list$SN$Sample, "4", sep="_")
samplesheet_list[["PUT"]]$sample_cluster <- paste(samplesheet_list$PUT$Sample, "4", sep="_")

plot_list_sn <- list()
plot_list_put <- list()
for (herv in c("HERV_dup880", te_res_list_df$SN_4[which(te_res_list_df$SN_4$padj < 0.05 & te_res_list_df$SN_4$log2FoldChange > 0),"TE_id"])){
  plot_list_sn[[herv]] <- box_pseudobulk(df = herv_counts_norm$SN_4,res =  te_res_list_df$SN_4,gene =  herv, log2=F)
  plot_list_put[[herv]] <- box_pseudobulk(df = herv_counts_norm$PUT_4, te_res_list_df$PUT_4, herv, log2=F)
}
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
box_pseudobulk(df = herv_counts_norm$SN_4, te_res_list_df$SN_4, "HERV_dup880", log2=T)

box_pseudobulk(df = herv_counts_norm$PUT_4, te_res_list_df$PUT_4, "HERV_dup880", log2=T)

box_pseudobulk(df = herv_counts_norm$PFC_4, te_res_list_df$PFC_4, "HERV_dup880", log2=T)

box_pseudobulk(df = herv_counts_norm$AMY_4, te_res_list_df$AMY_4, "HERV_dup880", log2=T)

box_pseudobulk(df = herv_counts_norm$PUT_1, te_res_list_df$PUT_1, "HERV_dup2896", log2=T)

box_pseudobulk(df = herv_counts_norm$PFC_1, te_res_list_df$PFC_1, "HERV_dup2896", log2=T)

box_pseudobulk(df = herv_counts_norm$AMY_1, te_res_list_df$AMY_1, "HERV_dup2896", log2=T)

box_pseudobulk(df = herv_counts_norm$SN_1, te_res_list_df$SN_1, "HERV_dup3355", log2=T)

box_pseudobulk(df = herv_counts_norm$PUT_1, te_res_list_df$PUT_1, "HERV_dup3355", log2=T)

box_pseudobulk(df = herv_counts_norm$PFC_1, te_res_list_df$PFC_1, "HERV_dup3355", log2=T)

box_pseudobulk(df = herv_counts_norm$AMY_1, te_res_list_df$AMY_1, "HERV_dup3355", log2=T)

5. Intersection of upregulated HERVs

get_significant_HERVs <- function(df){
  det <- df[which(df$padj < 0.05 & df$log2FoldChange > 1), "HERV_id"]
  if (length(det) > 0){
    return(det)  
  }else{
    return(NA)
  }
}

library(grid)
library(UpSetR)
celltypes <- c("Exc Neurons", "Inh Neurons", "Astrocytes", "OPCs", "Oligos", "Microglia", "Endothelial", "T cells")

# Astrocytes
upset(fromList(sapply(list("SN_2" = te_res_list_df$SN_2,
                           "PUT_2" = te_res_list_df$PUT_2,
                           "PFC_2" = te_res_list_df$PFC_2,
                           "AMY_2" = te_res_list_df$AMY_2), get_significant_HERVs)))
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
grid.text("Astrocytes", vjust = -17, hjust= -3)

# Microglia
upset(fromList(sapply(list("SN_4" = te_res_list_df$SN_4,
                           "PUT_4" = te_res_list_df$PUT_4,
                           "PFC_4" = te_res_list_df$PFC_4,
                           "AMY_4" = te_res_list_df$AMY_4), get_significant_HERVs)))
grid.text("Microglia", vjust = -17, hjust= -3)

# OPCs
upset(fromList(sapply(list("SN_3" = te_res_list_df$SN_3,
                           "PUT_3" = te_res_list_df$PUT_3,
                           "PFC_3" = te_res_list_df$PFC_3,
                           "AMY_3" = te_res_list_df$AMY_3), get_significant_HERVs)))
grid.text("OPCs", vjust = -17, hjust= -3)

# Oligos
upset(fromList(sapply(list("SN_5" = te_res_list_df$SN_5,
                           "PUT_5" = te_res_list_df$PUT_5,
                           "PFC_5" = te_res_list_df$PFC_5,
                           "AMY_5" = te_res_list_df$AMY_5), get_significant_HERVs)))
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
grid.text("Oligodendrocytes", vjust = -17, hjust= 0)

# Neurons cluster 1
upset(fromList(sapply(list("PUT_1" = te_res_list_df$PUT_1,
                           "PFC_1" = te_res_list_df$PFC_1,
                           "AMY_1" = te_res_list_df$AMY_1), get_significant_HERVs)))
grid.text("Neurons cluster 1", vjust = -17, hjust= -3)

# All in SN
upset(fromList(list(
                    # "OPCs" = get_significant_HERVs(te_res_list_df$SN_3),
                    "Oligos" = get_significant_HERVs(te_res_list_df$SN_5),
                    "Microglia" = get_significant_HERVs(te_res_list_df$SN_4),
                    # "VLMCs" = get_significant_HERVs(te_res_list_df$SN_6),
                    # "T cells" = get_significant_HERVs(te_res_list_df$SN_7),
                    # "Exc Neurons" = get_significant_HERVs(te_res_list_df$SN_0),
                    # "Inh Neurons" = get_significant_HERVs(te_res_list_df$SN_1),
                    "Astrocytes" = get_significant_HERVs(te_res_list_df$SN_2))), 
      sets = c("Oligos", "Microglia", "Astrocytes"), nintersects = 100, nsets = 100, mainbar.y.max = 30) 
grid.text("Substantia nigra", vjust = -17, hjust= 0)

# All in PUT
upset(fromList(list("OPCs" = get_significant_HERVs(te_res_list_df$PUT_3),
                    "Oligos" = get_significant_HERVs(te_res_list_df$PUT_5),
                    "Microglia" = get_significant_HERVs(te_res_list_df$PUT_4),
                    # "VLMCs" = get_significant_HERVs(te_res_list_df$PUT_6),
                    # "T cells" = get_significant_HERVs(te_res_list_df$PUT_7),
               # "Exc Neurons" = get_significant_HERVs(te_res_list_df$PUT_0),
               "Inh Neurons" = get_significant_HERVs(te_res_list_df$PUT_1),
               "Astrocytes" = get_significant_HERVs(te_res_list_df$PUT_2))), sets = c("OPCs", "Oligos", "Microglia", "Inh Neurons", "Astrocytes"), nintersects = 100, nsets = 100, mainbar.y.max = 30) 
grid.text("Putamen", vjust = -17, hjust= 0)

# All in AMY
upset(fromList(list("Exc Neurons" = get_significant_HERVs(te_res_list_df$AMY_0),
                    # "Inh Neurons" = get_significant_HERVs(te_res_list_df$AMY_1),
                    "Astrocytes" = get_significant_HERVs(te_res_list_df$AMY_2)
                    # "OPCs" = get_significant_HERVs(te_res_list_df$AMY_3),
                    # "Oligos" = get_significant_HERVs(te_res_list_df$AMY_5),
                    # "Microglia" = get_significant_HERVs(te_res_list_df$AMY_4),
                    # "VLMCs" = get_significant_HERVs(te_res_list_df$AMY_6),
                    # "T cells" = get_significant_HERVs(te_res_list_df$AMY_7)
                    )), sets = c("Exc Neurons", "Astrocytes"), nintersects = 100, nsets = 100, mainbar.y.max = 30) 
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
grid.text("Amygdala", vjust = -17, hjust= 0)

upset(fromList(list("SN: OPCs" = get_significant_HERVs(te_res_list_df$SN_3),
     "SN: Oligos" = get_significant_HERVs(te_res_list_df$SN_5),
     "SN: Microglia" = get_significant_HERVs(te_res_list_df$SN_4),
     "SN: VLMCs" = get_significant_HERVs(te_res_list_df$SN_6),
     "SN: T cells" = get_significant_HERVs(te_res_list_df$SN_7),
     "SN: Neurons cluster 0" = get_significant_HERVs(te_res_list_df$SN_0),
     "SN: Neurons cluster 1" = get_significant_HERVs(te_res_list_df$SN_1),
     "SN: Astrocytes" = get_significant_HERVs(te_res_list_df$SN_2),
     
     "PUT: OPCs" = get_significant_HERVs(te_res_list_df$PUT_3),
     "PUT: Oligos" = get_significant_HERVs(te_res_list_df$PUT_5),
     "PUT: Microglia" = get_significant_HERVs(te_res_list_df$PUT_4),
     "PUT: VLMCs" = get_significant_HERVs(te_res_list_df$PUT_6),
     "PUT: T cells" = get_significant_HERVs(te_res_list_df$PUT_7),
     "PUT: Neurons cluster 0" = get_significant_HERVs(te_res_list_df$PUT_0),
     "PUT: Neurons cluster 1" = get_significant_HERVs(te_res_list_df$PUT_1),
     "PUT: Astrocytes" = get_significant_HERVs(te_res_list_df$PUT_2),
     
     "PFC: OPCs" = get_significant_HERVs(te_res_list_df$PFC_3),
     "PFC: Oligos" = get_significant_HERVs(te_res_list_df$PFC_5),
     "PFC: Microglia" = get_significant_HERVs(te_res_list_df$PFC_4),
     "PFC: VLMCs" = get_significant_HERVs(te_res_list_df$PFC_6),
     "PFC: T cells" = get_significant_HERVs(te_res_list_df$PFC_7),
     "PFC: Neurons cluster 0" = get_significant_HERVs(te_res_list_df$PFC_0),
     "PFC: Neurons cluster 1" = get_significant_HERVs(te_res_list_df$PFC_1),
     "PFC: Astrocytes" = get_significant_HERVs(te_res_list_df$PFC_2),
     
     "AMY: OPCs" = get_significant_HERVs(te_res_list_df$AMY_3),
     "AMY: Oligos" = get_significant_HERVs(te_res_list_df$AMY_5),
     "AMY: Microglia" = get_significant_HERVs(te_res_list_df$AMY_4),
     "AMY: VLMCs" = get_significant_HERVs(te_res_list_df$AMY_6),
     "AMY: T cells" = get_significant_HERVs(te_res_list_df$AMY_7),
     "AMY: Neurons cluster 0" = get_significant_HERVs(te_res_list_df$AMY_0),
     "AMY: Neurons cluster 1" = get_significant_HERVs(te_res_list_df$AMY_1),
     "AMY: Astrocytes" = get_significant_HERVs(te_res_list_df$AMY_2))), nintersects = 100, nsets = 100, mainbar.y.max = 30) 

6. Intersection with in vitro experiments in microglia

Load HERV upregulation results from IFNγ, TNFα, and PolyIC-stimulated H9 microglia Compare overlaps with in vivo upregulated HERVs in SN and PUT microglia Visualize with UpSet plots and intersections

invitro_microglia <- list()
invitro_microglia[["microglia.H9.IFNg"]] <- read.xlsx("/Volumes/MyPassport/ASAP/code/ASAP_invitro/results/tables/upset_HERV_upreg_microglia_IFNg_TNFa_PolyIC.xlsx", sheet = "microglia.H9.IFNg", colNames = F)
invitro_microglia[["microglia.H9.TNFa"]] <- read.xlsx("/Volumes/MyPassport/ASAP/code/ASAP_invitro/results/tables/upset_HERV_upreg_microglia_IFNg_TNFa_PolyIC.xlsx", sheet = "microglia.H9.TNFa", colNames = F)
invitro_microglia[["microglia.H9.PolyIC"]] <- read.xlsx("/Volumes/MyPassport/ASAP/code/ASAP_invitro/results/tables/upset_HERV_upreg_microglia_IFNg_TNFa_PolyIC.xlsx", sheet = "microglia.H9.PolyIC", colNames = F)

upset(fromList(list("SN_microglia" = te_res_list_df$SN_4[which(te_res_list_df$SN_4$log2FoldChange > 1), "HERV_id"],
"PUT_microglia" = te_res_list_df$PUT_4[which(te_res_list_df$PUT_4$log2FoldChange > 1), "HERV_id"],
"microglia.H9.IFNg" = invitro_microglia[["microglia.H9.IFNg"]]$X1,
"microglia.H9.TNFa" = invitro_microglia[["microglia.H9.TNFa"]]$X1,
"microglia.H9.PolyIC" = invitro_microglia[["microglia.H9.PolyIC"]]$X1)))

intersect(te_res_list_df$PUT_4[which(te_res_list_df$PUT_4$log2FoldChange > 1), "HERV_id"], invitro_microglia[["microglia.H9.IFNg"]]$X1)
## [1] "HERV_dup300"
intersect(te_res_list_df$SN_4[which(te_res_list_df$SN_4$log2FoldChange > 1), "HERV_id"], invitro_microglia[["microglia.H9.IFNg"]]$X1)
## [1] "HERV_dup880"  "HERV_dup2789"

Save functions for L1 DEA

# Save functions in a separate file
dump(
  c("getSignName", "getAverage", 
    "meanPlot_cus", "box_pseudobulk", "pick_colors_meanplot"), 
  file = "TE_DEA_functions.R"
)

saveRDS(herv_counts_norm, "HERV_norm_expression.Rdata")